Given the results of the analysis of the Fluidigm data, here I explore the role of the V intercept in a more complex data (Allen).
We want to check if the intercept acts as a size factor here too and, if true, if W capture some interesting biology.
Interestingly, in this more complex dataset, both intercepts are needed to get a clear picture of the data in two dimensions.
As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were part of the “core” clusters. I will color-code the cells by either known cell type or by inferred cluster (inferred in the original study).
Select cells, remove ERCC spike-ins, filter out the genes that do not have at least 10 counts in at least 10 cells.
data("allen")
allen_core <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
which(colData(allen)$Core.Type=="Core")]
filter <- apply(assay(allen_core)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 11545
To speed up the computations, I will focus on the top 1,000 most variable genes.
allen_core <- allen_core[filter,]
core <- assay(allen_core)
vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]
First, let’s look at PCA (of the log counts) for reference.
par(mfrow=c(1, 2))
bio <- as.factor(colData(allen_core)$driver_1_s)
cl <- as.factor(colData(allen_core)$Primary.Type)
detection_rate <- colSums(core>0)
coverage <- colSums(core)
library(EDASeq)
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=cols[bio], pch=19, main="PCA of log-counts, centered not scaled")
legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts, centered not scaled")
df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.0000000 0.2269928 0.2543772 0.0875427
## PC2 0.2269928 1.0000000 -0.4918824 -0.1161663
## detection_rate 0.2543772 -0.4918824 1.0000000 0.5275845
## coverage 0.0875427 -0.1161663 0.5275845 1.0000000
print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
## user system elapsed
## 103.392 27.817 55.820
Plot the results with cells colored according to their biological condition.
par(mfrow=c(1, 2))
plot(zinb_Vall@W, col=cols[bio], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("bottomright", levels(bio), fill=cols)
plot(zinb_Vall@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2", main="Both V intercepts")
legend("topright", levels(cl), fill=cols2)
One interpretation of the model is that \(\gamma_mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.
#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.00000000 0.03507182 -0.08456147 0.3600626
## W2 0.03507182 1.00000000 -0.36770620 0.3713183
## gamma_mu -0.08456147 -0.36770620 1.00000000 -0.3353419
## gamma_pi 0.36006262 0.37131830 -0.33534190 1.0000000
## detection_rate -0.35181563 -0.34016219 0.37149947 -0.9939488
## coverage -0.05237705 -0.05644585 0.85040460 -0.4845843
## detection_rate coverage
## W1 -0.3518156 -0.05237705
## W2 -0.3401622 -0.05644585
## gamma_mu 0.3714995 0.85040460
## gamma_pi -0.9939488 -0.48458428
## detection_rate 1.0000000 0.52758451
## coverage 0.5275845 1.00000000
\(\gamma_pi\) clearly captures the detection rate and \(\gamma_mu\) clearly captures the coverage. Interestingly, the two estimates are positively correlated.
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vall)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_Vall))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(2, 2))
boxplot(zinb_Vall@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vall@beta_pi[1,], main="Beta_Pi")
plot(t(zinb_Vall@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vall@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
print(system.time(zinb_Vnone <- zinbFit(core, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(core)))))
## user system elapsed
## 121.594 30.205 60.748
par(mfrow=c(1, 2))
plot(zinb_Vnone@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_Vnone@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
df <- data.frame(W1 = zinb_Vnone@W[,1], W2 = zinb_Vnone@W[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 detection_rate coverage
## W1 1.000000000 0.005234128 -0.4111624 -0.1333461
## W2 0.005234128 1.000000000 0.8734817 0.5863095
## detection_rate -0.411162366 0.873481664 1.0000000 0.5275845
## coverage -0.133346120 0.586309465 0.5275845 1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vnone)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_Vnone))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(2, 2))
boxplot(zinb_Vnone@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vnone@beta_pi[1,], main="Beta_Pi")
plot(t(zinb_Vnone@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vnone@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
V <- cbind(rep(0, NROW(core)), rep(1, NROW(core)))
print(system.time(zinb_Vmu <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=2L, which_V_pi=1L)))
## user system elapsed
## 121.534 32.194 63.892
par(mfrow=c(1, 2))
plot(zinb_Vmu@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vmu@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
df <- data.frame(W1 = zinb_Vmu@W[,1], W2 = zinb_Vmu@W[,2], gamma_mu = zinb_Vmu@gamma_mu[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 gamma_mu detection_rate coverage
## W1 1.00000000 -0.05849243 0.0479402 -0.2684243 -0.1091023
## W2 -0.05849243 1.00000000 0.5594919 0.8888512 0.3763285
## gamma_mu 0.04794020 0.55949188 1.0000000 0.5882734 0.9234284
## detection_rate -0.26842434 0.88885119 0.5882734 1.0000000 0.5275845
## coverage -0.10910231 0.37632849 0.9234284 0.5275845 1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vmu)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_Vmu))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(2, 2))
boxplot(zinb_Vmu@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vmu@beta_pi[1,], main="Beta_Pi")
plot(t(zinb_Vmu@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vmu@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
print(system.time(zinb_Vpi <- zinbFit(core, ncores = 3, K = 2, V=V, which_V_mu=1L, which_V_pi=2L)))
## user system elapsed
## 164.088 44.875 87.087
par(mfrow=c(1, 2))
plot(zinb_Vpi@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_Vpi@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
df <- data.frame(W1 = zinb_Vpi@W[,1], W2 = zinb_Vpi@W[,2], gamma_pi = zinb_Vpi@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=cols[bio])
print(cor(df, method="spearman"))
## W1 W2 gamma_pi detection_rate
## W1 1.00000000 0.1300077 0.09476535 -0.09798919
## W2 0.13000772 1.0000000 -0.37655658 0.27521553
## gamma_pi 0.09476535 -0.3765566 1.00000000 -0.98666386
## detection_rate -0.09798919 0.2752155 -0.98666386 1.00000000
## coverage 0.15573929 -0.2577952 -0.47891212 0.52758451
## coverage
## W1 0.1557393
## W2 -0.2577952
## gamma_pi -0.4789121
## detection_rate 0.5275845
## coverage 1.0000000
par(mfrow=c(1, 2))
plot(rowMeans(getPi(zinb_Vpi)), detection_rate, xlab="Average estimated Pi", ylab="detection rate", pch=19, col=cols[bio])
plot(rowMeans(log1p(getMu(zinb_Vpi))), coverage, xlab="Average estimated log Mu", ylab="coverage", pch=19, col=cols[bio])
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
par(mfrow=c(2, 2))
boxplot(zinb_Vpi@beta_mu[1,], main="Beta_Mu")
boxplot(zinb_Vpi@beta_pi[1,], main="Beta_Pi")
plot(t(zinb_Vpi@alpha_mu), xlab="Alpha_Mu 1", ylab="Alpha_Mu 2", main="Alpha_Mu")
plot(t(zinb_Vpi@alpha_pi), xlab="Alpha_Pi 1", ylab="Alpha_Pi 2", main="Alpha_Pi")
print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3)))
## user system elapsed
## 125.764 30.668 63.766
pairs(zinb_3@W, col=cols[bio], pch=19)
pairs(zinb_3@W, col=cols2[cl], pch=19)
## write matrices to file to feed to ZIFA in python
write.csv(log1p(core), file="logcounts_allen.csv")
core <- assay(allen_core)
dim(core)
## [1] 11545 285
print(system.time(zinb_all <- zinbFit(core, ncores = 3, K = 2)))
## user system elapsed
## 1147.669 169.697 568.605
par(mfrow=c(1, 2))
plot(zinb_all@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
plot(zinb_all@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
Interestingly, when using all the genes, things don’t work nicely anymore. I wonder if the difference is the selection of the most variable genes, rather than the complexity of the data.
par(mfrow=c(1, 2))
zifa_res <- read.csv("zifa_allen_full.csv", header=FALSE)
plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")
par(mfrow=c(1, 2))
zifa_res <- read.csv("zifa_allen.csv", header=FALSE)
plot(zifa_res, col=cols[bio], pch=19, xlab="Z1", ylab="Z2")
plot(zifa_res, col=cols2[cl], pch=19, xlab="Z1", ylab="Z2")